#include <stdio.h>
#include <iostream>
#include <iomanip>
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <image3d.h>
#include "gcuda.h"
#include <transform.h>
#include <limits>

using namespace std;
using namespace i3d;

// checks a CUDA API return code and reports/aborts on failure
#define gpuErrchk(ans) { gpuassert::gpuAssert((ans), __FILE__, __LINE__, true); }

// computes maximum of 2 values
#ifndef MAX
#define MAX( a, b ) ( ((a) > (b)) ? (a) : (b) )
#endif

// computes minimum of 2 values
#ifndef MIN
#define MIN( a, b ) ( ((a) < (b)) ? (a) : (b) )
#endif

// computes max or min of 2 values (assumes operation_t operation defined in block)
#ifndef MYOP
#define MYOP( a, b ) ( ( operation == ERODE ) ? ( MIN ( ( a ), ( b ) ) ) : ( MAX ( ( a ), ( b ) ) ) )
#endif

// computes 1D array index from x, y (assumes int width defined in block)
#ifndef IDX
#define IDX( x, y ) ( ( x ) + ( ( y ) * ( width ) ) )
#endif

// computes x from 1D array index (assumes int width defined in block)
// argument fully parenthesized so expressions like XOF(i + 1) expand correctly
#ifndef XOF
#define XOF( x ) ( ( x ) % ( width ) )
#endif

// computes y from 1D array index (assumes int width defined in block)
// argument fully parenthesized so expressions like YOF(i + 1) expand correctly
#ifndef YOF
#define YOF( x ) ( (int)( ( x ) / ( width ) ) )
#endif

// computes absolute value
#ifndef ABS
#define ABS( x ) ( ( x ) > 0 ? ( x ) : -( x ) )
#endif

namespace gpuassert {
	// Reports a failed CUDA API call with its source location and, when `abort`
	// is set, terminates the process with the CUDA error code.
	// code  -- return value of the CUDA call being checked
	// file  -- source file name (normally __FILE__); const because string
	//          literals must not bind to char* in C++11 and later
	// line  -- source line number (normally __LINE__)
	// abort -- when true, exit() on failure instead of only logging
	inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort)
	{
		if (code != cudaSuccess)
		{
			fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
			if (abort) exit(code);
		}
	}
}

namespace gcuda {
	// threads per block along X for the 2-D morphology launches; also the block
	// size assumed by the 1-D reduction launches and their shared-memory sizing
	const int blockSizeX = 64;
	// NOTE(review): __CUDA_ARCH__ is only defined during device compilation, so in
	// host code (where these launch dimensions are consumed) this always takes the
	// #else branch -- confirm the 16-row variant is ever actually used
#if __CUDA_ARCH__ >= 200
	const int blockSizeY = 16;
#else
	const int blockSizeY = 8;
#endif
	// selects which morphological operation a kernel performs
	typedef enum {DILATE, ERODE} operation_t;

	// device-side copy of the structure-element mask; allocated, filled and freed
	// by the host wrapper functions below
	bool* offsets_d;

	// round off to next highest power of 2
	// round off to next highest power of 2
	// Bit-smearing trick: propagate the highest set bit of (v-1) into every lower
	// bit, then add one. An exact power of two maps to itself; 0 maps to 0.
	unsigned int nextHighestPower2(unsigned int v) {
		unsigned int r = v - 1;
		r |= r >> 1;
		r |= r >> 2;
		r |= r >> 4;
		r |= r >> 8;
		r |= r >> 16;
		return r + 1;
	}

	// parallel reduction over T data
	template <typename T> __global__ void ReduceGrays(const T *g_idata, unsigned int *g_odata, const size_t n) {
		extern __shared__ unsigned int sdata[];
		unsigned int tid = threadIdx.x;
		unsigned int i = blockIdx.x*(blockDim.x*2) + tid;
		unsigned int gridSize = blockDim.x*2*gridDim.x;
		unsigned int mySum = 0;

#if __CUDA_ARCH__ >= 200
		//printf("[%d]", tid);
#endif

		while (i < n) {
			mySum += g_idata[i];
			if(i + blockDim.x < n) mySum += g_idata[i+blockDim.x];
			i += gridSize;
		}

		// each thread puts its local sum into shared memory
		sdata[tid] = mySum;
		__syncthreads();

		// do reduction in shared mem
		if (blockDim.x >= 512) { if (tid < 256) { sdata[tid] = mySum = mySum + sdata[tid + 256]; } __syncthreads(); }
		if (blockDim.x >= 256) { if (tid < 128) { sdata[tid] = mySum = mySum + sdata[tid + 128]; } __syncthreads(); }
		if (blockDim.x >= 128) { if (tid <  64) { sdata[tid] = mySum = mySum + sdata[tid +  64]; } __syncthreads(); }

		if (tid < 32)
		{
			volatile unsigned int* smem = sdata;
			if (blockDim.x >=  64) { smem[tid] = mySum = mySum + smem[tid + 32]; }
			if (blockDim.x >=  32) { smem[tid] = mySum = mySum + smem[tid + 16]; }
			if (blockDim.x >=  16) { smem[tid] = mySum = mySum + smem[tid +  8]; }
			if (blockDim.x >=   8) { smem[tid] = mySum = mySum + smem[tid +  4]; }
			if (blockDim.x >=   4) { smem[tid] = mySum = mySum + smem[tid +  2]; }
			if (blockDim.x >=   2) { smem[tid] = mySum = mySum + smem[tid +  1]; }
		}
		if (tid == 0) g_odata[blockIdx.x] = sdata[0];
	}

	// parallel reduction over boolean data
	template<unsigned int blockSize> __global__ void ReduceBools(const bool *g_idata, int *g_odata, const size_t n) {
		extern __shared__ int sdata_b[];
		unsigned int tid = threadIdx.x;
		unsigned int i = blockIdx.x*(blockSize*2) + tid;
		unsigned int gridSize = blockSize*2*gridDim.x;
		unsigned int mySum = 0;

		while (i < n) {
			mySum += (g_idata[i] ? 1 : 0);
			if(i + blockSize < n) mySum += (g_idata[i+blockSize] ? 1 : 0);
			i += gridSize;
		}

		// each thread puts its local sum into shared memory
		sdata_b[tid] = mySum;
		__syncthreads();

		// do reduction in shared mem
		if (blockSize >= 512) { if (tid < 256) { sdata_b[tid] = mySum = mySum + sdata_b[tid + 256]; } __syncthreads(); }
		if (blockSize >= 256) { if (tid < 128) { sdata_b[tid] = mySum = mySum + sdata_b[tid + 128]; } __syncthreads(); }
		if (blockSize >= 128) { if (tid <  64) { sdata_b[tid] = mySum = mySum + sdata_b[tid +  64]; } __syncthreads(); }

		if (tid < 32)
		{
			volatile int* smem = sdata_b;
			if (blockSize >=  64) { smem[tid] = mySum = mySum + smem[tid + 32]; }
			if (blockSize >=  32) { smem[tid] = mySum = mySum + smem[tid + 16]; }
			if (blockSize >=  16) { smem[tid] = mySum = mySum + smem[tid +  8]; }
			if (blockSize >=   8) { smem[tid] = mySum = mySum + smem[tid +  4]; }
			if (blockSize >=   4) { smem[tid] = mySum = mySum + smem[tid +  2]; }
			if (blockSize >=   2) { smem[tid] = mySum = mySum + smem[tid +  1]; }
		}
		if (tid == 0) g_odata[blockIdx.x] = sdata_b[0];
	}

	// Difference of two images. out_intensity stores intensity difference, out_different stores whether
	// they are in fact different or not (in one given pixel).
	// Difference of two images. out_intensity stores intensity difference, out_different stores whether
	// they are in fact different or not (in one given pixel).
	// One thread per pixel; threads with i >= n exit immediately.
	// For the opening use-case in_a (original) >= in_b (opened) holds pixel-wise,
	// so the unsigned subtraction does not underflow -- TODO confirm for other callers.
	template<typename T> __global__ void Diff(const T *in_a, const T *in_b, T *out_intensity, bool *out_different, const int n) {
		int i = blockIdx.x * blockDim.x + threadIdx.x;

		if (i >= n) return;
		out_intensity[i] = in_a[i] - in_b[i];
		// direct comparison instead of "difference > 0" avoids integer-promotion
		// subtleties; the original also declared an unused __shared__ counter here
		out_different[i] = in_a[i] > in_b[i];
	}

	// Helper function to get min or max from upper and lower rows (determined by rowOffset and range).
	// To be used with the Bresenham midpoint algorithm from MorphCore.
	// Helper function to get min or max from upper and lower rows (determined by rowOffset and range).
	// To be used with the Bresenham midpoint algorithm from MorphCore.
	// operation      -- ERODE (track minimum) or DILATE (track maximum); consumed by the MYOP macro
	// currentExtreme -- running extreme the scanned pixels are folded into
	// in             -- source image as a flat row-major array
	// tx             -- flat index of the centre pixel (decomposed via XOF/YOF)
	// N              -- total number of elements in `in`
	// width          -- row length; captured by the IDX/XOF/YOF macros
	// range          -- half-width of the horizontal span scanned in each row
	// rowOffset      -- vertical distance of the rows to scan (0 = centre row only)
	// Returns the min/max of currentExtreme and the scanned pixels.
	// NOTE(review): not called from any code visible in this file -- presumably used
	// by MorphCore elsewhere; verify before removing.
	template<typename T>
	__device__ T GetExtremeFromUprAndLwrRow(const operation_t operation, const T currentExtreme, const T *in, const int tx, const unsigned int N, const int width, const int range, const int rowOffset) {
		T myExtreme = currentExtreme;

		int x = XOF(tx);
		int y = YOF(tx);

		if (0 == rowOffset) {
			// get the row's minimum in range, clamped to the image's left/right edges
			int leftbound = IDX( MAX(0, x - range), y );
			int rightbound = IDX( MIN(x + range, width - 1), y );
			for (int i = leftbound; i <= rightbound; ++i) {
				myExtreme = MYOP(myExtreme, in[i]);
			}
		} else {
			// get upper row minimum in range (skipped when the row lies above the image)
			if (0 <= y - rowOffset) {
				int leftbound = IDX( MAX(0, x - range), y - rowOffset );
				int rightbound = IDX( MIN(x + range, width - 1), y - rowOffset );
				for (int i = leftbound; i <= rightbound; ++i) {
					myExtreme = MYOP(myExtreme, in[i]);
				}
			}

			// get lower row minimum in range (skipped when the row lies below the image;
			// the int operand promotes to unsigned here, safe because y + rowOffset >= 0)
			if (width * (y + rowOffset) < N) {
				int leftbound = IDX( MAX(0, x - range), y + rowOffset );
				int rightbound = IDX( MIN(x + range, width - 1), y + rowOffset );
				for (int i = leftbound; i <= rightbound; ++i) {
					myExtreme = MYOP(myExtreme, in[i]);
				}
			}
		}

		return myExtreme;
	}

	// CUDA Kernel -- used for image erosion or dilation with a circular structure element of radius "radius"
	// CUDA Kernel -- used for image erosion or dilation with a circular structure element of radius "radius"
	// in/out are distinct device buffers of at least N elements; offsets is a
	// diameter x diameter boolean mask of the SE. The launch grid is padded up to
	// powers of two (may exceed width and N/width), so every global-memory access
	// must be bounds-checked. totalExtreme is unused; kept for interface compatibility.
	template<typename T> __global__ void MorphGpu(const operation_t operation, const T *in, T *out, const unsigned int N, const int width, const int radius, const T totalExtreme, const bool* offsets)
	{
		int tx = blockIdx.x * blockDim.x + threadIdx.x;
		int ty = blockIdx.y * blockDim.y + threadIdx.y;
		int diameter = radius * 2 + 1;

		int thx = IDX(tx, ty);

		// does this thread map to a real pixel? (padding threads must not write --
		// the original copied out[thx] = in[thx] unguarded, writing out of bounds)
		bool inside = (tx < width) && (thx < (int)N);

		if (inside) out[thx] = in[thx];
		__syncthreads();

		// cycle through the offsets (loop bounds are uniform across the block, so
		// the __syncthreads() below is always reached by every thread)
		for (int y = 0; y < diameter; ++y)
		{
			for (int x = 0; x < diameter; ++x)
			{
				// if the current offset belongs in the SE
				if (offsets[y*diameter + x])
				{
					int xofs = tx + x - radius;
					int yofs = ty + y - radius;
					int idx = IDX(xofs, yofs);

					if ( inside
						&& xofs >= 0 && xofs < width	// the offset x value should be within 0..width
						&& idx >= 0 && idx < (int)N		// the final index value should be within 0..N
						)
					{
						T oldval = out[thx];
						T newval = in[idx];
						// keep the extreme: minimum for erosion, maximum for dilation
						if (operation == ERODE)
						{
							if (oldval > newval)
							{
								out[thx] = newval;
							}
						}
						else
						{
							if (oldval < newval)
							{
								out[thx] = newval;
							}
						}
					}
				}
				__syncthreads();
			}
		}
	}

	// fill a row of a Structure Element
	// fill a row of a Structure Element
	// Writes row (rowOffset + width/2) of a width x width boolean mask: cells whose
	// horizontal distance from the centre column is at most `range` become true,
	// every other cell in that row becomes false.
	void FillOffsetsRow(bool* offsets, int width, int range, int rowOffset) {
		const int centre = width / 2;
		bool* row = offsets + (rowOffset + centre) * width;
		for (int x = 0; x < width; ++x) {
			row[x] = (x >= centre - range) && (x <= centre + range);
		}
	}

	// Builds a (2r+1) x (2r+1) boolean mask of a filled circle of radius r using the
	// Bresenham/midpoint circle walk: every row the walk visits is filled between the
	// circle's left and right boundary for that row.
	// The caller owns the returned buffer and must release it with free().
	bool* GetCircularSEOffsets(const int r)
	{
		int diameter = 2*r + 1;
		// precompute structure element; calloc zero-initializes, so any row the
		// midpoint walk does not visit is a well-defined "false" instead of the
		// uninitialized garbage the original malloc left behind
		bool* offsets = (bool*)calloc(diameter*diameter, sizeof(bool));

		// fill in the initial row of the circular SE (Y coord = 0)
		FillOffsetsRow(offsets, diameter, r, 0);

		// standard midpoint-circle error terms
		int error = 1 - r;
		int errorY = 1;
		int errorX = -2 * r;
		int x = r, y = 0;

		while (y < x - 1) {
			if (error > 0)
			{
				// just before the computed x coordinate changes: fill the two rows
				// at vertical offset +-x (octant symmetry, y acts as the half-width)
				FillOffsetsRow(offsets, diameter, y, x);
				FillOffsetsRow(offsets, diameter, y, -x);

				--x;
				errorX += 2;
				error += errorX;
			}

			y++;
			errorY += 2;
			error += errorY;

			// rows at vertical offset +-y, half-width x
			FillOffsetsRow(offsets, diameter, x, y);
			FillOffsetsRow(offsets, diameter, x, -y);
		}
		// final iteration covers the rows at the very top/bottom of the circle
		FillOffsetsRow(offsets, diameter, y, x);
		FillOffsetsRow(offsets, diameter, y, -x);

		return offsets;
	}

	// opening - I used this function to test stuff
	// opening - I used this function to test stuff
	// Morphological opening (erosion then dilation) with a circular structure
	// element of the given radius; returns the opened image by value.
	template<typename T> Image3d<T> OpenGpu(Image3d<T> img, const unsigned int radius) {
		unsigned int width = img.GetWidth();
		unsigned int height = img.GetHeight();
		// grid is padded up to powers of two; MorphGpu bounds-checks the padding threads
		unsigned int w = nextHighestPower2(width);
		unsigned int h = nextHighestPower2(height);
		const int n = width * height;
		const int diameter = radius*2 + 1;

		// automatic storage: the original heap-allocated this copy, returned *rslt
		// by value and leaked the pointer
		Image3d<T> rslt(img);

		T *vx = rslt.GetFirstVoxelAddr();

		// precompute structure element (host side); freed below
		bool* offsets = GetCircularSEOffsets(radius);

		// kernel parameters
		dim3 dimBlock2( blockSizeX, blockSizeY );
		dim3 dimGrid2( ceil( w / ((float)(blockSizeX))), ceil ( h / ((float)blockSizeY)) );

		// source voxel array on device (orig)
		T *vx_d;

		// result voxel array on device (for result of erosion)
		T *vxr1_d;

		// result voxel array on device (for result of dilation)
		T *vxr2_d;

		// clear any sticky error from earlier CUDA calls
		gpuErrchk(cudaGetLastError());

		// allocate memory on device and upload the SE mask and the source image
		gpuErrchk( cudaMalloc( (void**)&offsets_d, diameter*diameter ) );
		gpuErrchk( cudaMemcpy( offsets_d, offsets, diameter*diameter, cudaMemcpyHostToDevice ) );

		gpuErrchk( cudaMalloc( (void**)&vx_d, n * sizeof(T) ) );
		gpuErrchk( cudaMemcpy( vx_d, vx, n * sizeof(T), cudaMemcpyHostToDevice ) );

		gpuErrchk( cudaMalloc( (void**)&vxr1_d, n * sizeof(T) ) );
		gpuErrchk( cudaMemcpy( vxr1_d, vx_d, n * sizeof(T), cudaMemcpyDeviceToDevice ) );

		gpuErrchk( cudaMalloc( (void**)&vxr2_d, n * sizeof(T) ) );
		gpuErrchk( cudaMemcpy( vxr2_d, vx_d, n * sizeof(T), cudaMemcpyDeviceToDevice ) );

		// erosion followed by dilation = opening
		MorphGpu<T><<<dimGrid2, dimBlock2>>>(ERODE, vx_d, vxr1_d, n, width, radius, 0, offsets_d);
		cudaDeviceSynchronize();
		gpuErrchk(cudaGetLastError());

		MorphGpu<T><<<dimGrid2, dimBlock2>>>(DILATE, vxr1_d, vxr2_d, n, width, radius, numeric_limits<T>::max(), offsets_d);
		cudaDeviceSynchronize();
		gpuErrchk(cudaGetLastError());

		// download the opened image back into the result's voxel buffer
		gpuErrchk( cudaMemcpy( vx, vxr2_d, n * sizeof(T), cudaMemcpyDeviceToHost ) );

		// free device memory (the original never released offsets_d)
		gpuErrchk( cudaFree( vx_d ) );
		gpuErrchk( cudaFree( vxr1_d ) );
		gpuErrchk( cudaFree( vxr2_d ) );
		gpuErrchk( cudaFree( offsets_d ) );

		free(offsets);

		return rslt;
	}

	// The actual granulodescriptor -- not meant to be called from outside.
	// The actual granulodescriptor -- not meant to be called from outside.
	// Runs `numberOfOpenings` morphological openings with growing circular SEs
	// (radius k = 1..numberOfOpenings) on the first slice of `img`, accumulating
	// for each k the summed intensity difference and the count of changed pixels,
	// then prints the normalized granulometry vectors to stdout.
	template<typename T> void GranuloGpu(Image3d<T> img, const unsigned int numberOfOpenings) {
		unsigned int width = img.GetWidth();
		unsigned int height = img.GetHeight();
		unsigned int w = nextHighestPower2(width);
		unsigned int h = nextHighestPower2(height);
		const size_t n = width * height;
		const size_t N = w * h;
		// largest SE diameter used below (k = numberOfOpenings)
		const int maxDiameter = 2 * (int)numberOfOpenings + 1;

		cerr << "Started GPU implementation";

		int iterator = 0, orig_intensity_sum = 0, nonzero_pixels = 0;
		int sum_intensity = 0, num_diff_pixels = 0;

		unsigned int *vec1 = new unsigned int[numberOfOpenings + 1];
		unsigned int *vec2 = new unsigned int[numberOfOpenings + 1];
		vec1[0] = 0;
		vec2[0] = 0;

		Image3d<T>* rslt = new Image3d<T>();
		rslt->MakeRoom(width, height, 1); // set pic size
		// copy metadata from source
		rslt->SetOffset(img.GetOffset());
		rslt->SetResolution(img.GetResolution());

		// copy the image into the resized output image and use the loop to compute the first element
		// of the intensity array and maximum/minimum pixel value at the same time
		T vxl, totalMax, totalMin;
		totalMax = img.GetVoxel(0, 0, 0);
		totalMin = totalMax;
		for (unsigned int y = 0; y < height; ++y) {
			for (unsigned int x = 0; x < width; ++x) {
				vxl = img.GetVoxel(x, y, 0);
				totalMax = MAX(vxl, totalMax);
				totalMin = MIN(vxl, totalMin);
				rslt->SetVoxel(x, y, 0, vxl);
				vec1[0] += vxl;
				if (vxl > 0) ++nonzero_pixels;
			}
		}

		// vec1[0] is overwritten by the first loop iteration below; keep the total
		orig_intensity_sum = vec1[0];

		T *vx = rslt->GetFirstVoxelAddr();

		// kernel parameters (2-D grid for morphology, 1-D grid for diff/reductions)
		dim3 dimBlock2( blockSizeX, blockSizeY );
		dim3 dimBlock1( blockSizeX );
		dim3 dimGrid2( ceil( w / (float)blockSizeX), ceil( h / (float)blockSizeY) );
		dim3 dimGrid1( ceil( N / (float)blockSizeX) );

		// source voxel array on device (orig)
		T *vx_d;

		// result voxel array on device (for result of erosion)
		T *vxr1_d;

		// second result voxel array on device (for result of dilation of vxr1_d)
		T *vxr2_d;

		// reduction output on device
		unsigned int *rdc_d;

		// reduction output on host
		unsigned int *rdc;

		// boolean reduction output on device
		int *brdc_d;

		// boolean reduction output on host
		int *brdc;

		// image difference matrix (device)
		bool *diff_d;

		// allocate memory on device
		gpuErrchk( cudaMalloc( (void**)&vx_d, n * sizeof(T) ) );
		gpuErrchk( cudaMemcpy( vx_d, vx, n * sizeof(T), cudaMemcpyHostToDevice ) );

		gpuErrchk( cudaMalloc( (void**)&vxr1_d, n * sizeof(T) ) );
		gpuErrchk( cudaMalloc( (void**)&vxr2_d, n * sizeof(T) ) );
		gpuErrchk( cudaMalloc( (void**)&diff_d, n * sizeof(bool) ) );
		// sized for the largest mask: the original allocated numberOfOpenings^2
		// bytes, smaller than the (2k+1)^2 mask MorphGpu reads -> device OOB access
		gpuErrchk( cudaMalloc( (void**)&offsets_d, maxDiameter*maxDiameter ) );

		gpuErrchk( cudaMalloc( (void**)&rdc_d, (dimGrid1.x * sizeof(unsigned int)) ) );
		rdc = (unsigned int *) malloc(dimGrid1.x * sizeof(unsigned int));
		gpuErrchk( cudaMalloc( (void**)&brdc_d, (dimGrid1.x * sizeof(int)) ) );
		brdc = (int *) malloc(dimGrid1.x * sizeof(int));

		// perform all the openings and granulodescriptor-related computations
		for (unsigned int k = 1; k <= numberOfOpenings; ++k) {
			bool* offsets = GetCircularSEOffsets(k);
			int diameter = k*2 + 1;
			// copy exactly the diameter^2 bytes that exist in the host-side mask
			// (the original copied numberOfOpenings^2 bytes, over-reading `offsets`)
			gpuErrchk( cudaMemcpy( offsets_d, offsets, diameter*diameter, cudaMemcpyHostToDevice ) );

			// image opening (erosion then dilation)
			MorphGpu<T><<<dimGrid2, dimBlock2>>>(ERODE, vx_d, vxr1_d, n, width, k, totalMin, offsets_d);
			cudaDeviceSynchronize(); gpuErrchk(cudaGetLastError());
			MorphGpu<T><<<dimGrid2, dimBlock2>>>(DILATE, vxr1_d, vxr2_d, n, width, k, totalMax, offsets_d);
			cudaDeviceSynchronize(); gpuErrchk(cudaGetLastError());

			// difference between opened and original image
			// (difference image and difference boolean matrix)
			Diff<T><<<dimGrid1, dimBlock1>>>(vx_d, vxr2_d, vxr1_d, diff_d, n);
			cudaDeviceSynchronize(); gpuErrchk(cudaGetLastError());

			// compute reduction of difference image
			int sdata_size1 = (sizeof(unsigned int)*blockSizeX);
			ReduceGrays<T><<<dimGrid1, dimBlock1, sdata_size1>>>(vxr1_d, rdc_d, n);
			cudaDeviceSynchronize(); gpuErrchk(cudaGetLastError());

			// compute reduction of difference matrix
			int sdata_bsize = (sizeof(int)*blockSizeX);
			ReduceBools<blockSizeX><<<dimGrid1, dimBlock1, sdata_bsize>>>(diff_d, brdc_d, n);
			cudaDeviceSynchronize(); gpuErrchk(cudaGetLastError());

			// copy reduction data to host and finish the sum computation on CPU
			gpuErrchk( cudaMemcpy( rdc, rdc_d, dimGrid1.x * sizeof(unsigned int), cudaMemcpyDeviceToHost ) );
			gpuErrchk( cudaMemcpy( brdc, brdc_d, dimGrid1.x * sizeof(int), cudaMemcpyDeviceToHost ) );

			for (unsigned int ss = 0; ss < dimGrid1.x; ++ss) {
				sum_intensity += rdc[ss];
				num_diff_pixels += brdc[ss];
			}

			// fill vector data
			vec1[iterator] = sum_intensity;
			vec2[iterator] = num_diff_pixels;

			cerr << "."; // report progress

			// free offsets memory
			free(offsets);

			// reset per-iteration accumulators
			sum_intensity = 0;
			num_diff_pixels = 0;
			++iterator;
		}

		// free device memory
		gpuErrchk( cudaFree( vx_d ) );
		gpuErrchk( cudaFree( vxr1_d ) );
		gpuErrchk( cudaFree( vxr2_d ) );
		gpuErrchk( cudaFree( rdc_d ) );
		gpuErrchk( cudaFree( brdc_d ) );
		gpuErrchk( cudaFree( diff_d ) );
		gpuErrchk( cudaFree(offsets_d) );

		// replicate the CPU implementation's output using GPU data
		cout << endl << "GPU Implementation Output:" << endl;
		// printf("#objectKey messif.objects.keys.AbstractObjectKey %s\n", filename.c_str());
		printf("Granulo2_intEnd;messif.objects.impl.GranulometryThresholdL1;Granulo2_intSimple;messif.objects.impl.GranulometryThresholdL1;Granulo2_pixEnd;messif.objects.impl.ObjectFloatVectorL1;Granulo2_pixSimple;messif.objects.impl.ObjectFloatVectorL1;Granulo2_intEnd_deriv;messif.objects.impl.ObjectFloatVectorL1;\n");

		// intensity sums normalized by the last value
		for (int i=0; i < numberOfOpenings-1; i++) cout << (double)vec1[i]/vec1[iterator-1] << ",";
		cout << (double)vec1[numberOfOpenings-1]/vec1[iterator-1] << endl;

		// intensity sums normalized by the original total intensity
		for (int i=0; i < numberOfOpenings-1; i++) cout << (double)vec1[i]/orig_intensity_sum << ",";
		cout << (double)vec1[numberOfOpenings-1]/orig_intensity_sum << endl;

		// changed-pixel counts normalized by the last value
		for (int i=0; i < numberOfOpenings-1; i++) cout << (double)vec2[i]/vec2[iterator-1] << ",";
		cout << (double)vec2[numberOfOpenings-1]/vec2[iterator-1] << endl;

		// changed-pixel counts normalized by the number of nonzero pixels
		for (int i=0; i < numberOfOpenings-1; i++) cout << (double)vec2[i]/nonzero_pixels << ",";
		cout << (double)vec2[numberOfOpenings-1]/nonzero_pixels << endl;

		// differences of the normalized intensity sums; the original indexed
		// vec1[5] unconditionally, reading out of bounds when numberOfOpenings < 6
		if (numberOfOpenings >= 6) {
			for (int i=1; i < 5; i++) cout << (double)vec1[i]/vec1[iterator-1] - (double)vec1[i-1]/vec1[iterator-1] << ",";
			cout << (double)vec1[5]/vec1[iterator-1] - (double)vec1[4]/vec1[iterator-1] << endl;
		}

		// free host memory (all of this leaked in the original)
		free(rdc);
		free(brdc);
		delete[] vec1;
		delete[] vec2;
		delete rslt;
	}

	// call this from outside
	// call this from outside
	// Public non-template entry point: granulometry descriptor for 8-bit grayscale images.
	void GranuloGpuGray8(Image3d<GRAY8> img, const unsigned int numberOfOpenings) {
		GranuloGpu(img, numberOfOpenings);
	}

	// call this from outside
	// call this from outside
	// Public non-template entry point: granulometry descriptor for 16-bit grayscale images.
	void GranuloGpuGray16(Image3d<GRAY16> img, const unsigned int numberOfOpenings) {
		GranuloGpu(img, numberOfOpenings);
	}

	// call this from outside
	// call this from outside
	// Public non-template entry point: morphological opening of an 8-bit grayscale image.
	Image3d<GRAY8> OpenGpuGray8(Image3d<GRAY8> img, const unsigned int radius) {
		return OpenGpu(img, radius);
	}

	// call this from outside
	// call this from outside
	// Public non-template entry point: morphological opening of a 16-bit grayscale image.
	Image3d<GRAY16> OpenGpuGray16(Image3d<GRAY16> img, const unsigned int radius) {
		return OpenGpu(img, radius);
	}
}
